Fix pagetable pinning logic for xen/i386 kernels.
author: kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Mon, 7 Nov 2005 17:14:45 +0000 (18:14 +0100)
committer: kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Mon, 7 Nov 2005 17:14:45 +0000 (18:14 +0100)
The pin flag is now associated with the pgd rather than the mm -- this
avoids a race where a pgd is allocated from the pgd_cache but,
before it gets associated with an mm, the kernel suspends itself.
At this point the kernel mappings will not get rewritten when the
kernel is resumed, and the system will fail.

A further advantage is that the code is slightly simpler and less
invasive (no changes to mm_context for example).

Signed-off-by: Keir Fraser <keir@xensource.com>
linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c
linux-2.6-xen-sparse/arch/xen/i386/mm/init.c
linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c
linux-2.6-xen-sparse/arch/xen/kernel/reboot.c
linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h
linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h
linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h

index 905dcd3928a2ae939972bbee0930b5b1b4910330..5ef1ed10fcb6a7fccf1d621e48510b8447a6cba4 100644 (file)
@@ -18,7 +18,6 @@
 #include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
-#include <asm/mmu_context.h>
 
 #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
 static void flush_ldt(void *null)
@@ -101,19 +100,14 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
        struct mm_struct * old_mm;
        int retval = 0;
 
-       memset(&mm->context, 0, sizeof(mm->context));
        init_MUTEX(&mm->context.sem);
+       mm->context.size = 0;
        old_mm = current->mm;
        if (old_mm && old_mm->context.size > 0) {
                down(&old_mm->context.sem);
                retval = copy_ldt(&mm->context, &old_mm->context);
                up(&old_mm->context.sem);
        }
-       if (retval == 0) {
-               spin_lock(&mm_unpinned_lock);
-               list_add(&mm->context.unpinned, &mm_unpinned);
-               spin_unlock(&mm_unpinned_lock);
-       }
        return retval;
 }
 
@@ -134,11 +128,6 @@ void destroy_context(struct mm_struct *mm)
                        kfree(mm->context.ldt);
                mm->context.size = 0;
        }
-       if (!mm->context.pinned) {
-               spin_lock(&mm_unpinned_lock);
-               list_del(&mm->context.unpinned);
-               spin_unlock(&mm_unpinned_lock);
-       }
 }
 
 static int read_ldt(void __user * ptr, unsigned long bytecount)
index b5f700501251591b9867eaa89c1b68fcfae89185..bed3831a2358eea01b4f015bc9c0a57ff34663d0 100644 (file)
@@ -376,7 +376,6 @@ static void __init pagetable_init (void)
                __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
        }
 
-       init_mm.context.pinned = 1;
        kernel_physical_mapping_init(pgd_base);
        remap_numa_kva();
 
@@ -689,6 +688,8 @@ void __init mem_init(void)
 #ifndef CONFIG_SMP
        zap_low_mappings();
 #endif
+
+       set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
 }
 
 kmem_cache_t *pgd_cache;
index 503d48842ab7cc43f9e62e1567d7e0ae80859c3b..3af8e92144a1917335fb8c9fbd5c53c7abdd5a2f 100644 (file)
@@ -27,6 +27,9 @@
 #include <asm-xen/foreign_page.h>
 #include <asm/hypervisor.h>
 
+static void __pgd_pin(pgd_t *pgd);
+static void __pgd_unpin(pgd_t *pgd);
+
 void show_mem(void)
 {
        int total = 0, reserved = 0;
@@ -299,6 +302,8 @@ void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 {
        unsigned long flags; /* can be called from interrupt context */
 
+       BUG_ON(test_bit(PG_pinned, &virt_to_page(pgd)->flags));
+
        if (HAVE_SHARED_KERNEL_PMD)
                return;
 
@@ -312,6 +317,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
        int i = 0;
        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
 
+       BUG_ON(test_bit(PG_pinned, &virt_to_page(pgd)->flags));
+
        if (PTRS_PER_PMD == 1 || !pgd)
                return pgd;
 
@@ -351,15 +358,9 @@ out_oom:
 void pgd_free(pgd_t *pgd)
 {
        int i;
-       pte_t *ptep = virt_to_ptep(pgd);
 
-       if (!pte_write(*ptep)) {
-               xen_pgd_unpin(__pa(pgd));
-               BUG_ON(HYPERVISOR_update_va_mapping(
-                       (unsigned long)pgd,
-                       pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
-                       0));
-       }
+       if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
+               __pgd_unpin(pgd);
 
        /* in the PAE case user pgd entries are overwritten before usage */
        if (PTRS_PER_PMD > 1) {
@@ -441,10 +442,7 @@ void make_pages_writable(void *va, unsigned int nr)
 }
 #endif /* CONFIG_XEN_SHADOW_MODE */
 
-LIST_HEAD(mm_unpinned);
-DEFINE_SPINLOCK(mm_unpinned_lock);
-
-static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
 {
        struct page *page = virt_to_page(pt);
        unsigned long pfn = page_to_pfn(page);
@@ -456,103 +454,111 @@ static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
                pfn_pte(pfn, flags), 0));
 }
 
-static void mm_walk(struct mm_struct *mm, pgprot_t flags)
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
 {
-       pgd_t       *pgd;
-       pud_t       *pud;
-       pmd_t       *pmd;
-       pte_t       *pte;
-       int          g,u,m;
+       pgd_t *pgd = pgd_base;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       int    g, u, m;
 
-       pgd = mm->pgd;
        for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                if (PTRS_PER_PUD > 1) /* not folded */
-                       mm_walk_set_prot(pud,flags);
+                       pgd_walk_set_prot(pud,flags);
                for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                        if (pud_none(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);
                        if (PTRS_PER_PMD > 1) /* not folded */
-                               mm_walk_set_prot(pmd,flags);
+                               pgd_walk_set_prot(pmd,flags);
                        for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
                                if (pmd_none(*pmd))
                                        continue;
                                pte = pte_offset_kernel(pmd,0);
-                               mm_walk_set_prot(pte,flags);
+                               pgd_walk_set_prot(pte,flags);
                        }
                }
        }
+
+       BUG_ON(HYPERVISOR_update_va_mapping(
+               (unsigned long)pgd_base,
+               pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+               UVMF_TLB_FLUSH));
+}
+
+static void __pgd_pin(pgd_t *pgd)
+{
+       pgd_walk(pgd, PAGE_KERNEL_RO);
+       xen_pgd_pin(__pa(pgd));
+       set_bit(PG_pinned, &virt_to_page(pgd)->flags);
+}
+
+static void __pgd_unpin(pgd_t *pgd)
+{
+       xen_pgd_unpin(__pa(pgd));
+       pgd_walk(pgd, PAGE_KERNEL);
+       clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
 }
 
 void mm_pin(struct mm_struct *mm)
 {
-    spin_lock(&mm->page_table_lock);
-
-    mm_walk(mm, PAGE_KERNEL_RO);
-    BUG_ON(HYPERVISOR_update_va_mapping(
-        (unsigned long)mm->pgd,
-        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
-        UVMF_TLB_FLUSH));
-    xen_pgd_pin(__pa(mm->pgd));
-    mm->context.pinned = 1;
-    spin_lock(&mm_unpinned_lock);
-    list_del(&mm->context.unpinned);
-    spin_unlock(&mm_unpinned_lock);
-
-    spin_unlock(&mm->page_table_lock);
+       spin_lock(&mm->page_table_lock);
+       __pgd_pin(mm->pgd);
+       spin_unlock(&mm->page_table_lock);
 }
 
 void mm_unpin(struct mm_struct *mm)
 {
-    spin_lock(&mm->page_table_lock);
-
-    xen_pgd_unpin(__pa(mm->pgd));
-    BUG_ON(HYPERVISOR_update_va_mapping(
-        (unsigned long)mm->pgd,
-        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
-    mm_walk(mm, PAGE_KERNEL);
-    xen_tlb_flush();
-    mm->context.pinned = 0;
-    spin_lock(&mm_unpinned_lock);
-    list_add(&mm->context.unpinned, &mm_unpinned);
-    spin_unlock(&mm_unpinned_lock);
-
-    spin_unlock(&mm->page_table_lock);
+       spin_lock(&mm->page_table_lock);
+       __pgd_unpin(mm->pgd);
+       spin_unlock(&mm->page_table_lock);
 }
 
 void mm_pin_all(void)
 {
-    while (!list_empty(&mm_unpinned))  
-       mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
-                         context.unpinned));
+       struct page *page;
+       for (page = pgd_list; page; page = (struct page *)page->index) {
+               if (!test_bit(PG_pinned, &page->flags))
+                       __pgd_pin((pgd_t *)page_address(page));
+       }
 }
 
 void _arch_exit_mmap(struct mm_struct *mm)
 {
-    struct task_struct *tsk = current;
+       struct task_struct *tsk = current;
 
-    task_lock(tsk);
+       task_lock(tsk);
 
-    /*
-     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
-     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
-     */
-    if ( tsk->active_mm == mm )
-    {
-        tsk->active_mm = &init_mm;
-        atomic_inc(&init_mm.mm_count);
+       /*
+        * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+        * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+        */
+       if (tsk->active_mm == mm) {
+               tsk->active_mm = &init_mm;
+               atomic_inc(&init_mm.mm_count);
 
-        switch_mm(mm, &init_mm, tsk);
+               switch_mm(mm, &init_mm, tsk);
 
-        atomic_dec(&mm->mm_count);
-        BUG_ON(atomic_read(&mm->mm_count) == 0);
-    }
+               atomic_dec(&mm->mm_count);
+               BUG_ON(atomic_read(&mm->mm_count) == 0);
+       }
 
-    task_unlock(tsk);
+       task_unlock(tsk);
 
-    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
-        mm_unpin(mm);
+       if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
+           (atomic_read(&mm->mm_count) == 1))
+               mm_unpin(mm);
 }
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
index d017cf9620905970e8cf77a79955e404275d8616..89c20054508d6381e8b3d0010dd8be6dedf9033a 100644 (file)
@@ -129,8 +129,8 @@ static int __do_suspend(void *ignore)
        preempt_disable();
 
 #ifdef __i386__
-       mm_pin_all();
        kmem_cache_shrink(pgd_cache);
+       mm_pin_all();
 #endif
 
        __cli();
index b628b46f3be794476ec2511afecb6c27f5bc1a8b..32987b80b163c9061aac2ce787ecbe8bb04ae8f7 100644 (file)
@@ -12,13 +12,8 @@ typedef struct {
        int size;
        struct semaphore sem;
        void *ldt;
-       unsigned pinned:1;
-       struct list_head unpinned;
 } mm_context_t;
 
-extern struct list_head mm_unpinned;
-extern spinlock_t mm_unpinned_lock;
-
 /* mm/memory.c:exit_mmap hook */
 extern void _arch_exit_mmap(struct mm_struct *mm);
 #define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
index c5567bc9b304b085c5d4558ffb9b727919051ff3..129f79aba04a52953b9f619a6b0bc3d18d1d3108 100644 (file)
@@ -53,7 +53,7 @@ static inline void switch_mm(struct mm_struct *prev,
        struct mmuext_op _op[2], *op = _op;
 
        if (likely(prev != next)) {
-               if (!next->context.pinned)
+               if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
                        mm_pin(next);
 
                /* stop flush ipis for the previous mm */
index f559b93a5f990921497755a19db3696f28b9d9ab..98e802babc78f4e8fc935e846aceffd3a3e5a32f 100644 (file)
@@ -7,12 +7,15 @@
 #include <linux/mm.h>          /* for struct page */
 #include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
 
+/* Is this pagetable pinned? */
+#define PG_pinned      PG_arch_1
+
 #define pmd_populate_kernel(mm, pmd, pte) \
                set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
 
 #define pmd_populate(mm, pmd, pte)                                     \
 do {                                                                   \
-       if (unlikely((mm)->context.pinned)) {                           \
+       if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) {     \
                if (!PageHighMem(pte))                                  \
                        BUG_ON(HYPERVISOR_update_va_mapping(            \
                          (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\